1. IMPORT


library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.22.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)

# ROOT DIRECTORY (to modify on your computer)
path.root <- "~/Projects/MetaIBS"
path.zhu  <- file.path(path.root, "scripts/analysis-individual/Zhu-2019")
path.data <- file.path(path.root, "data/analysis-individual/Zhu-2019")

2. QUALITY CHECK


2.1. Fastq quality profiles

First, we import the fastq files containing the raw reads. The samples were downloaded from the ENA database with the accession number PRJNA566284.

# Save the path to the directory containing the fastq zipped files
path.fastq <- file.path(path.data, "raw_fastq")
# list.files(path.fastq) # check we are in the right directory

# fastq filenames have format: SAMPLENAME.fastq.gz
# Saves the whole directory path to each file name
fnFs <- sort(list.files(path.fastq, pattern="_1.fastq.gz", full.names = TRUE)) # FWD reads
fnRs <- sort(list.files(path.fastq, pattern="_2.fastq.gz", full.names = TRUE)) # REV reads
show(fnFs[1:5])
show(fnRs[1:5])

# Extract sample names, assuming filenames have format: SAMPLENAME.fastq.gz
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
show(sample.names) # saves only the file name (without the path)

# Look at quality of all files
for (i in 1:3){  # 1:length(fnFs)
  show(plotQualityProfile(fnFs[i]))
  show(plotQualityProfile(fnRs[i]))
}

# Look at nb of reads per sample
# raw_stats <- data.frame('sample' = sample.names,
#                         'reads' = fastqq(fnFs)@nReads)
# min(raw_stats$reads)
# max(raw_stats$reads)
# mean(raw_stats$reads)

We will have a quick peak at the per base sequence content of the reads in some samples, to make sure there is no anomaly (i.e. all reads having the same sequence).

# Look at per base sequence content (forward read)
fseqF1 <- seqTools::fastqq(fnFs[10])
## [fastqq] File ( 1/1) '/Users/enigma/Projects/MetaIBS/data/analysis-individual/Zhu-2019/raw_fastq/SRR10142835_1.fastq.gz' done.
rcF1 <- read_content(fseqF1)
plot_read_content(rcF1) + labs(title = "Per base sequence content - Forward read")

fseqR1 <- seqTools::fastqq(fnRs[10])
## [fastqq] File ( 1/1) '/Users/enigma/Projects/MetaIBS/data/analysis-individual/Zhu-2019/raw_fastq/SRR10142835_2.fastq.gz' done.
rcR1 <- read_content(fseqR1)
plot_read_content(rcR1) + labs(title = "Per base sequence content - Reverse read")

2.2. Look for primers

Now, we will look whether the reads still contain the primers. Primer sequences are given in the methods section of the paper.

# V4
FWD <- "GTGCCAGCMGCCGCGGTAA"  # U515 forward primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # E786 reverse primer sequence (in reality, it's the sequence of 5'-806-786-3')

# Function that, from the primer sequence, will return all combinations possible (complement, reverse complement, ...)
allOrients <- function(primer) {
    # Create all orientations of the input sequence
    require(Biostrings)
    dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
        RevComp = reverseComplement(dna))
    return(sapply(orients, toString))  # Convert back to character vector
}

# Get all combinations of the primer sequences
FWD.orients <- allOrients(FWD) # U515
REV.orients <- allOrients(REV) # E786
FWD.orients # sanity check
##               Forward            Complement               Reverse 
## "GTGCCAGCMGCCGCGGTAA" "CACGGTCGKCGGCGCCATT" "AATGGCGCCGMCGACCGTG" 
##               RevComp 
## "TTACCGCGGCKGCTGGCAC"
REV.orients
##                Forward             Complement                Reverse 
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG" 
##                RevComp 
## "ATTAGAWACCCBDGTAGTCC"
# Function that counts number of reads in which a sequence is found
primerHits <- function(primer, fn) {
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE, max.mismatch = 2)
    return(sum(nhits > 0))
}

# Get a table to know how many times the U515 and E786 primers are found in the reads of each sample
for (i in 6:9){
  cat("SAMPLE", sample.names[i], "with total number of", raw_stats[i,'reads'], "reads\n\n")
  x <- rbind(ForwardRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnFs[[i]]),
             ForwardRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnFs[[i]]),
             ReverseRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnRs[[i]]), 
             ReverseRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnRs[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE SRR10142831 with total number of 54075 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer   53518         18      17      17
## ForwardRead.REVPrimer      17         18      18      16
## ReverseRead.FWDPrimer      20         19      19      27
## ReverseRead.REVPrimer      21         19      19      19
## 
## ____________________________________________
## 
## SAMPLE SRR10142832 with total number of 51364 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer   50791         13      13      13
## ForwardRead.REVPrimer      13         14      13      13
## ReverseRead.FWDPrimer      15         17      14      17
## ReverseRead.REVPrimer      15         15      14      15
## 
## ____________________________________________
## 
## SAMPLE SRR10142833 with total number of 32638 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer   32569          0       0       0
## ForwardRead.REVPrimer       0          0       0       0
## ReverseRead.FWDPrimer       0          0       0   28114
## ReverseRead.REVPrimer      21          0       0       0
## 
## ____________________________________________
## 
## SAMPLE SRR10142834 with total number of 38514 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer   38442          0       0       0
## ForwardRead.REVPrimer       0          0       0       0
## ReverseRead.FWDPrimer       0          0       0   34209
## ReverseRead.REVPrimer      34          0       0       0
## 
## ____________________________________________

Let’s have a quick look at where primers are positioned in the forward/reverse reads

# Function that gets position in which sequence is found
primerHitsPosition <- function(primer, fn){
  hits <- as.data.frame(vmatchPattern(primer, sread(readFastq(fn)), fixed = FALSE, max.mismatch = 2))
  hits <- hits[,c("group", "start")]
  colnames(hits) <- c("sample", "start")
  hits$sample <- sapply(strsplit(basename(fn), "_"), `[`, 1)
  hits$readslength <- seqTools::fastqq(fn)@maxSeqLen
  return(hits)
}

# Get position of primers in forward reads
FWDpos.FWDread <- data.frame()
for(i in 1:length(fnFs)){
  cat("SAMPLE", i)
  newF <- primerHitsPosition(FWD.orients["Forward"], fnFs[[i]])
  FWDpos.FWDread <- rbind(newF, FWDpos.FWDread)
}

# Get position of REV primers
FWDpos.REVread <- data.frame()
for(i in 1:length(fnRs)){
  cat("SAMPLE", i)
  newF <- primerHitsPosition(FWD.orients["RevComp"], fnRs[[i]])
  FWDpos.REVread <- rbind(newF, FWDpos.REVread)
}
# add metadata info on samples
metadata_table <- read.csv(file.path(path.data, "00_Metadata-Zhu/Metadata-Zhu.csv"), row.names=1)
FWDpos.REVread <- FWDpos.REVread %>%
  left_join(metadata_table[,c("Run", "host_disease")], by=join_by("sample"=="Run"))
# FORWARD READS
ggplot(FWDpos.FWDread, aes(x=start))+
  geom_density() +
  xlim(c(0,max(FWDpos.FWDread$readslength)))+
  labs(x="start position of primer", y="density of primers starting at x position", title="FORWARD READS")

# REVERSE READS
ggplot(FWDpos.REVread, aes(x=start))+
  geom_density() +
  facet_wrap(~sample)+ # weird pattern...
  xlim(c(0,max(FWDpos.REVread$readslength)))+
  labs(x="start position of primer", y="density of primers starting at x position", title="REVERSE READS")

ggplot(FWDpos.REVread, aes(x=start))+
  geom_density() +
  facet_wrap(~host_disease)+ # seems like it's determined by host_disease...
  xlim(c(0,max(FWDpos.REVread$readslength)))+
  labs(x="start position of primer", y="density of primers starting at x position", title="REVERSE READS")


3. FILTER AND TRIM


3.1. Primer removal

The forward reads indeed contain the forward primer (in the middle of the reads), and the reverse reads contain the revcomp of the forward primer (which makes sense, it shows the forward primer is found at the end of the reverse reads). But only half the samples contain a primer in the reverse reads (seems like it’s the healthy samples). Ideally, to follow the standardized pipeline, we should (1) filter out reads not containing any primer and (2) trim the primers. However, we would lose all of our IBS samples in step 1. Thus, we will simply perform a quality filtering of the reads (step 2).

3.2. Quality filtering

Then, we perform a quality filtering of the reads.

# Place filtered files in a filtered/ subdirectory
FWD.filt2_samples <- file.path(path.data, "filtered2", paste0(sample.names, "_1_filt2.fastq.gz")) # FWD reads
REV.filt2_samples <- file.path(path.data, "filtered2", paste0(sample.names, "_2_filt2.fastq.gz")) # REV reads
# Assign names for the filtered fastq.gz files
names(FWD.filt2_samples) <- sample.names
names(REV.filt2_samples) <- sample.names

# Filter
out2 <- filterAndTrim(fwd = fnFs, filt = FWD.filt2_samples,
                      rev = fnRs, filt.rev = REV.filt2_samples,
                      maxEE=3, # reads with more than 3 expected errors (sum(10e(-Q/10))) are discarded
                      truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
                      minLen = 150, # Discard reads shorter than 150 bp. This is done after trimming and truncation.
                      compress=TRUE, multithread = TRUE, verbose=TRUE)

Let’s look at the output filtered fastq files as sanity check.

out2[1:6,] # show how many reads were filtered in each file
##                        reads.in reads.out
## SRR10142826_1.fastq.gz    51061     48240
## SRR10142827_1.fastq.gz    55164     52238
## SRR10142828_1.fastq.gz    53155     50320
## SRR10142829_1.fastq.gz    51938     48538
## SRR10142830_1.fastq.gz    52139     49449
## SRR10142831_1.fastq.gz    54075     51228
# Look at quality profile of all filtered files
for (i in 1:3){
  show(plotQualityProfile(FWD.filt2_samples[i]))
  show(plotQualityProfile(REV.filt2_samples[i]))
}


4. CONSTRUCT ASV TABLE


4.1. Learn error rates

Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.

set.seed(123)
errF <- learnErrors(FWD.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)
set.seed(123)
errR <- learnErrors(REV.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.

plotErrors(errF, nominalQ = TRUE) # Forward reads

plotErrors(errR, nominalQ = TRUE) # Reverse reads

4.2. Infer sample composition

The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

# Dereplicate the reads in the sample
derepF <- derepFastq(FWD.filt2_samples) # forward
derepR <- derepFastq(REV.filt2_samples) # reverse

# Infer sequence variants
dadaFs <- dada(derepF, err=errF, multithread=TRUE) # forward
dadaRs <- dada(derepR, err=errR, multithread=TRUE) # reverse
# Inspect the infered sequence variants from sample 1:3
for (i in 1:3){
  print(dadaFs[[i]])
  print(dadaRs[[i]])
  print("________________")
}
## dada-class: object describing DADA2 denoising results
## 168 sequence variants were inferred from 15778 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 201 sequence variants were inferred from 12570 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 224 sequence variants were inferred from 16783 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 238 sequence variants were inferred from 14228 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 214 sequence variants were inferred from 18811 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 254 sequence variants were inferred from 15475 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"

4.3. Merge paired-end reads

We now need to merge paired reads.

mergers <- mergePairs(dadaFs, derepF, dadaRs, derepR, verbose=TRUE)
head(mergers[[1]])

4.4. Construct ASV table

We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

# Make sequence table from the infered sequence variants
seqtable <- makeSequenceTable(mergers)

# We should have 29 samples (29 rows)
dim(seqtable)
## [1]   29 5307
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

The V4 region should be ~250bp, and our ASVs are rather ~400bp long. Considering that the forward primer was found in the middle of the forward reads, it is possible that we have much longer ASVs than we should (pre-V4 region + V4 region). Let’s try trimming the ASVs until the end of the forward primer.

# Get the ASVs
asvs <- getSequences(seqtable)

# Trim the ASVs when they contain the forward primer.
# FWD primer: GTGCCAGCMGCCGCGGTAA (M: A or C)
new.asvs <- sub(".*GTGCCAGCAGCCGCGGTAA|.*GTGCCAGCCGCCGCGGTAA", "", getSequences(seqtable))
length(new.asvs) == length(asvs) # sanity check
## [1] TRUE
# Let's check the length of our new asvs
hist(nchar(new.asvs), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

# Put these new ASVs in a separate seqtable
seqtable.new <- seqtable
colnames(seqtable.new) <- new.asvs

# Sanity check
# for (i in 1:length(asvs)){
#   test <- str_detect(string = getSequences(seqtable)[i],
#                      pattern = getSequences(seqtable.new)[i])
#   if(test==FALSE){print('FALSE')}
# }

# Is there any duplicate ASVs now?
table(duplicated(new.asvs))
## 
## FALSE  TRUE 
##  1651  3656
table(duplicated(asvs))
## 
## FALSE 
##  5307
# Create table to have corresponding ASVs pre/post-cut (to compare taxonomy assignment later on)
asv.df <- data.frame("asv" = paste0("ASV", 1:length(asvs)),
                     "pre_cut" = asvs,
                     "post_cut" = new.asvs)

# We will merge columns with same ASV and sum their counts
seqtable.small <- sapply(unique(colnames(seqtable.new)),
                         function(i) as.integer(rowSums(as.data.frame(seqtable.new[,colnames(seqtable.new) == i]))))
rownames(seqtable.small) <- rownames(seqtable.new) # add sample names as rownames

# sanity checks
ncol(seqtable.small) == length(unique(new.asvs))
## [1] TRUE
rowSums(seqtable.new) == rowSums(seqtable.small)
## SRR10142826 SRR10142827 SRR10142828 SRR10142829 SRR10142830 SRR10142831 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
## SRR10142832 SRR10142833 SRR10142834 SRR10142835 SRR10142836 SRR10142837 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
## SRR10142838 SRR10142839 SRR10142840 SRR10142841 SRR10142842 SRR10142843 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
## SRR10142844 SRR10142845 SRR10142846 SRR10142847 SRR10142848 SRR10142849 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
## SRR10142850 SRR10142851 SRR10142852 SRR10142853 SRR10142854 
##        TRUE        TRUE        TRUE        TRUE        TRUE
sum(seqtable.new) == sum(seqtable.small)
## [1] TRUE
# Check the ASVs are same sequence length, but we should have less of them
hist(nchar(getSequences(seqtable.small)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

# Sequences should be between 515F - 806R, so around ~250bp (removing the primers lengths).
# Remove any sequence variant outside 200:300bp
seqtable.small.new <- seqtable.small[,nchar(colnames(seqtable.small)) %in% 200:300]
dim(seqtable.small.new)
## [1]   29 1631
hist(nchar(getSequences(seqtable.small.new)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

# check we haven't lost too many counts
sum(seqtable.small.new)/sum(seqtable.small)
## [1] 0.9996091

4.5. Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

seqtable.nochim <- removeBimeraDenovo(seqtable, method="consensus", multithread=TRUE, verbose=TRUE)

# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1]   29 1057
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable)
## [1] 0.8911222
# Same for the table with ASVs where I cut the FWD primer
seqtable.small.nochim <- removeBimeraDenovo(seqtable.small.new, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtable.small.nochim)
## [1]  29 672
sum(seqtable.small.nochim)/sum(seqtable.small.new)
## [1] 0.9482145

5. LOOK AT READS COUNT THROUGH PIPELINE


Sanity check before assigning taxonomy.

# Function that counts nb of reads
getN <- function(x) sum(getUniques(x))

# Table that will count number of reads for each process of interest (input reads, filtered reads, denoised reads, non chimera reads)
track <- cbind(out2,
               sapply(dadaFs, getN),
               sapply(dadaRs, getN),
               sapply(mergers, getN),
               rowSums(seqtable.small.nochim),
               lapply(rowSums(seqtable.small.nochim)*100/out2[,1], as.integer))

# Assign column and row names
colnames(track) <- c("input", "quality-filt", "denoisedF", "denoisedR", 'merged', 'nonchim', "%input->output")
rownames(track) <- sample.names

# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track
##             input quality-filt denoisedF denoisedR merged nonchim
## SRR10142826 51061 48240        47624     47751     25767  25496  
## SRR10142827 55164 52238        51769     51784     36666  36366  
## SRR10142828 53155 50320        49775     49726     40494  38961  
## SRR10142829 51938 48538        48215     48173     13442  12432  
## SRR10142830 52139 49449        49039     48994     34396  32915  
## SRR10142831 54075 51228        50820     50620     41524  39983  
## SRR10142832 51364 48822        48383     48260     42524  40725  
## SRR10142833 32638 21143        20043     20537     14753  13943  
## SRR10142834 38514 25140        24640     24742     20454  18618  
## SRR10142835 42417 26199        24909     25675     17136  15769  
## SRR10142836 45595 28699        26731     27942     16544  15113  
## SRR10142837 35822 21822        20868     21415     15855  15169  
## SRR10142838 34463 21734        20525     21150     14414  12195  
## SRR10142839 41761 27285        26402     26830     23167  18496  
## SRR10142840 32501 19495        17998     18955     9727   9474   
## SRR10142841 22694 14641        14175     14345     10588  9850   
## SRR10142842 50554 48003        47648     47460     42029  39592  
## SRR10142843 21956 14145        13244     13634     9698   8307   
## SRR10142844 35708 23147        22198     22498     15607  13699  
## SRR10142845 29069 19484        19061     19254     14988  14213  
## SRR10142846 44981 28703        28290     28365     20580  19883  
## SRR10142847 33463 21424        19826     20739     11418  10495  
## SRR10142848 51762 49268        48953     48824     41034  40064  
## SRR10142849 54523 51877        51382     51194     45208  43108  
## SRR10142850 52637 50266        49831     49702     43957  40755  
## SRR10142851 54179 51760        51303     51112     45147  43067  
## SRR10142852 54060 51373        51001     50855     43330  42022  
## SRR10142853 54531 51848        51235     51245     42970  41752  
## SRR10142854 54434 51456        51117     50951     42117  41580  
##             %input->output
## SRR10142826 49            
## SRR10142827 65            
## SRR10142828 73            
## SRR10142829 23            
## SRR10142830 63            
## SRR10142831 73            
## SRR10142832 79            
## SRR10142833 42            
## SRR10142834 48            
## SRR10142835 37            
## SRR10142836 33            
## SRR10142837 42            
## SRR10142838 35            
## SRR10142839 44            
## SRR10142840 29            
## SRR10142841 43            
## SRR10142842 78            
## SRR10142843 37            
## SRR10142844 38            
## SRR10142845 48            
## SRR10142846 44            
## SRR10142847 31            
## SRR10142848 77            
## SRR10142849 79            
## SRR10142850 77            
## SRR10142851 79            
## SRR10142852 77            
## SRR10142853 76            
## SRR10142854 76

6. TAXONOMIC TABLE


6.1. Assign taxonomy

Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.

path.silva <- file.path(path.root, "data/analysis-individual/CLUSTER/taxonomy/silva-taxonomic-ref")

# Assign taxonomy (with silva v138)
taxa <- assignTaxonomy(seqtable.nochim, file.path(path.silva, "silva_nr99_v138.1_train_set.fa.gz"),
                       tryRC = TRUE, # try reverse complement of the sequences
                       multithread=TRUE, verbose = TRUE)
# Add species assignment
taxa <- addSpecies(taxa, file.path(path.silva, "silva_species_assignment_v138.1.fa.gz"))


# Same with our optimized seqtable
taxa.small <- assignTaxonomy(seqtable.small.nochim, file.path(path.silva, "silva_nr99_v138.1_train_set.fa.gz"),
                            tryRC = TRUE, # try reverse complement of the sequences
                            multithread=TRUE, verbose = TRUE)
taxa.small <- addSpecies(taxa.small, file.path(path.silva, "silva_species_assignment_v138.1.fa.gz"))
# Check how the taxonomy table looks like
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum         Class        
## [1,] "Bacteria" "Firmicutes"   "Clostridia" 
## [2,] "Bacteria" "Firmicutes"   "Clostridia" 
## [3,] "Bacteria" "Firmicutes"   "Clostridia" 
## [4,] "Bacteria" "Firmicutes"   "Clostridia" 
## [5,] "Bacteria" "Firmicutes"   "Clostridia" 
## [6,] "Bacteria" "Bacteroidota" "Bacteroidia"
##      Order                                 Family                 
## [1,] "Peptostreptococcales-Tissierellales" "Peptostreptococcaceae"
## [2,] "Lachnospirales"                      "Lachnospiraceae"      
## [3,] "Clostridiales"                       "Clostridiaceae"       
## [4,] "Peptostreptococcales-Tissierellales" "Peptostreptococcaceae"
## [5,] "Lachnospirales"                      "Lachnospiraceae"      
## [6,] "Bacteroidales"                       "Bacteroidaceae"       
##      Genus                         Species     
## [1,] "Intestinibacter"             "bartlettii"
## [2,] "Blautia"                     NA          
## [3,] "Clostridium sensu stricto 1" NA          
## [4,] "Romboutsia"                  NA          
## [5,] "[Ruminococcus] gnavus group" NA          
## [6,] "Bacteroides"                 "plebeius"
table(taxa.print[,1], useNA="ifany") # Show the different kingdoms
## 
## Bacteria     <NA> 
##     1056        1
table(taxa.print[,2], useNA="ifany") # Show the different phyla
## 
##             Actinobacteriota                 Bacteroidota 
##                           55                          333 
##                Cyanobacteria                 Deinococcota 
##                            1                            1 
##             Desulfobacterota                   Firmicutes 
##                            2                          553 
##               Fusobacteriota              Patescibacteria 
##                           11                            2 
##               Proteobacteria SAR324 clade(Marine group B) 
##                           80                            1 
##                 Synergistota            Verrucomicrobiota 
##                            1                            3 
##                         <NA> 
##                           14
# Check how the taxonomy table looks like (where we trimmed the ASV sequences)
taxa.small.print <- taxa.small
rownames(taxa.small.print) <- NULL
head(taxa.small.print)
##      Kingdom    Phylum         Class        
## [1,] "Bacteria" "Firmicutes"   "Clostridia" 
## [2,] "Bacteria" "Firmicutes"   "Clostridia" 
## [3,] "Bacteria" "Firmicutes"   "Clostridia" 
## [4,] "Bacteria" "Firmicutes"   "Clostridia" 
## [5,] "Bacteria" "Firmicutes"   "Clostridia" 
## [6,] "Bacteria" "Bacteroidota" "Bacteroidia"
##      Order                                 Family                 
## [1,] "Peptostreptococcales-Tissierellales" "Peptostreptococcaceae"
## [2,] "Lachnospirales"                      "Lachnospiraceae"      
## [3,] "Clostridiales"                       "Clostridiaceae"       
## [4,] "Peptostreptococcales-Tissierellales" "Peptostreptococcaceae"
## [5,] "Lachnospirales"                      "Lachnospiraceae"      
## [6,] "Bacteroidales"                       "Bacteroidaceae"       
##      Genus                         Species     
## [1,] "Intestinibacter"             "bartlettii"
## [2,] "Blautia"                     NA          
## [3,] "Clostridium sensu stricto 1" NA          
## [4,] "Romboutsia"                  NA          
## [5,] "[Ruminococcus] gnavus group" NA          
## [6,] "Bacteroides"                 "plebeius"
table(taxa.small.print[,1], useNA="ifany") # Show the different kingdoms
## 
## Bacteria 
##      672
table(taxa.small.print[,2], useNA="ifany") # Show the different phyla
## 
##             Actinobacteriota                 Bacteroidota 
##                           33                          157 
##                Cyanobacteria                 Deinococcota 
##                            1                            1 
##             Desulfobacterota                   Firmicutes 
##                            2                          430 
##               Fusobacteriota               Proteobacteria 
##                            8                           36 
## SAR324 clade(Marine group B)                 Synergistota 
##                            1                            1 
##            Verrucomicrobiota 
##                            2

6.2. Compare taxonomic assignment with/without cutting ASVs

# taxa dataframes
taxa.df <- data.frame(taxa)
taxa.df$asv <- rownames(taxa.df)
taxa.small.df <- data.frame(taxa.small)
taxa.small.df$asv <- rownames(taxa.small.df)

# remove chimera from asv pre/post cut dataframe
asv.df.nochim <- asv.df[asv.df$pre_cut %in% rownames(taxa.df),]
colnames(asv.df.nochim) <- c("asvID", "asv_precut", "asv_postcut")
# Sanity checks
# table(duplicated(asv.df.nochim$asv_postcut))
# dim(asv.df.nochim) # should be 1057
# table(asv.df.nochim$asv_precut %in% rownames(taxa))
# table(asv.df.nochim$asv_postcut %in% rownames(taxa.small)) # 56 ASVs post-cut are not in the taxonomic table (probably because they are chimeras)

# Compare taxonomic assignment pre/post-cut (phylum level)
compare.df <- merge(asv.df.nochim, taxa.df[,c("Phylum", "asv")], by.x="asv_precut", by.y="asv") # pre-cut phylum
colnames(compare.df)[4] <- "Phylum_precut"
compare.df <- merge(x=compare.df, y=taxa.small.df[,c("Phylum", "asv")], by.x="asv_postcut", by.y="asv") # post-cut phylum
colnames(compare.df)[5] <- "Phylum_postcut"

# The 56 ASVs present in pre-cut but not in post-cut were removed (59 rows were deleted)
dim(compare.df)
## [1] 1001    5
# How many rows (ASVs) have different phylum?
table(compare.df$Phylum_precut == compare.df$Phylum_postcut, useNA="ifany") # 26 ASVs assigned to different phyla
## 
## FALSE  TRUE  <NA> 
##    26   973     2
phylum.all <- compare.df %>%
  select(asvID, Phylum_precut, Phylum_postcut) %>%
  dplyr::rename(precut=Phylum_precut, postcut=Phylum_postcut) %>%
  mutate(diff=ifelse(precut != postcut, TRUE, FALSE)) %>%
  tidyr::pivot_longer(!c(asvID, diff), names_to="cut", values_to="Phylum")
  

# Plot
library(ggalluvial)
ggplot(phylum.all,
       aes(x = cut, stratum = Phylum, alluvium = asvID,
           label = Phylum)) +
  geom_flow(aes(color=diff),stat = "alluvium", lode.guidance = "frontback") +
  geom_stratum(aes(fill=Phylum)) +
  scale_color_manual(values=c("lightgrey", "#de2d26", "lightgrey"))+
  scale_fill_brewer(type = "qual", palette = "Set3") +
  theme_bw()+
  labs(x="", y="#ASVs", title="ASV taxonomic assignment (with/without cutting)", color='Different?')

# ggsave(file.path(path.data, "01_Dada2-Zhu/pre-post-cut_taxAssignment_phylum.jpg"), width=10, height=6)


# *********************
# SAME AT GENUS LEVEL
compare.df.genus <- merge(asv.df.nochim, taxa.df[,c("Genus", "asv")], by.x="asv_precut", by.y="asv") # pre-cut genus
colnames(compare.df.genus)[4] <- "Genus_precut"
compare.df.genus <- merge(x=compare.df.genus, y=taxa.small.df[,c("Genus", "asv")], by.x="asv_postcut", by.y="asv") # post-cut genus
colnames(compare.df.genus)[5] <- "Genus_postcut"
dim(compare.df.genus)
## [1] 1001    5
# How many rows (ASVs) have different phylum?
table(compare.df.genus$Genus_precut == compare.df.genus$Genus_postcut, useNA="ifany") # 31 ASVs assigned to diff. genus, but many more NAs
## 
## FALSE  TRUE  <NA> 
##    31   816   154
genus.all <- compare.df.genus %>%
  select(asvID, Genus_precut, Genus_postcut) %>%
  dplyr::rename(precut=Genus_precut, postcut=Genus_postcut) %>%
  mutate(diff=ifelse(precut != postcut, TRUE, FALSE)) %>%
  tidyr::pivot_longer(!c(asvID, diff), names_to="cut", values_to="Genus")


# Plot
ggplot(genus.all,
       aes(x = cut, stratum = Genus, alluvium = asvID,
           label = Genus)) +
  geom_flow(aes(color=diff),stat = "alluvium", lode.guidance = "frontback") +
  geom_stratum(aes(fill=Genus)) +
  scale_color_manual(values=c("lightgrey", "#de2d26", "lightgrey"))+
  theme_bw()+
  theme(legend.text = element_text(size=3),
        legend.key.size = unit(0.1, "cm"))+
  labs(x="", y="#ASVs", title="ASV taxonomic assignment (with/without cutting)", color='Different?')+
  guides(fill=guide_legend(ncol=3))

# ggsave(file.path(path.data, "01_Dada2-Zhu/pre-post-cut_taxAssignment_genus.jpg"), width=10, height=6)

7. LAST PREPROCESSING


We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.

7.1. Create phyloseq object

The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.

#_________________________
# Import metadata
metadata_table <- read.csv(file.path(path.data, "00_Metadata-Zhu/Metadata-Zhu.csv"), row.names=1)

#_________________________
# Create phyloseq object
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the ASVs are in rows
                  sample_data(metadata_table), 
                  tax_table(taxa))
physeq.small <- phyloseq(otu_table(seqtable.small.nochim, taxa_are_rows=FALSE),
                  sample_data(metadata_table), 
                  tax_table(taxa.small))

# Remove taxa that are eukaryota, or have unassigned Phyla
physeq <- subset_taxa(physeq, !is.na(Kingdom))
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)
# No sample was deleted, so we don't need to remove taxa present in low-count samples
# physeq <- prune_taxa(taxa_sums(physeq)>0, physeq)

# Remove taxa that are eukaryota, or have unassigned Phyla
physeq.small <- subset_taxa(physeq.small, !is.na(Kingdom))
physeq.small <- subset_taxa(physeq.small, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq.small <- prune_samples(sample_sums(physeq.small)>=500, physeq.small)

7.2. Quick peek at data analysis

# Absolute abundance
# plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free_x") + theme(axis.text.x = element_blank())

# Relative abundance for Phylum
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 5, angle = -90))+
  labs(x = "Samples", y = "Relative abundance")

# Absolute abundance
# plot_bar(physeq.small, fill = "Phylum")+ facet_wrap("host_disease", scales="free_x") + theme(axis.text.x = element_blank())

# Relative abundance for Phylum
phylum.table <- physeq.small %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 5, angle = -90))+
  labs(x = "Samples", y = "Relative abundance")

For any further analysis, we will be doing it on the physeq_shortASV object, as the ASVs sequences better reflect the V4 variable region.

7.3. Save to disk

# Save to disk
saveRDS(raw_stats,      file.path(path.data, "01_Dada2-Zhu/raw_stats.rds"))
saveRDS(FWDpos.FWDread, file.path(path.data, "01_Dada2-Zhu/forwardreads_primerposition.rds"))
saveRDS(FWDpos.REVread, file.path(path.data, "01_Dada2-Zhu/reversereads_primerposition.rds"))
saveRDS(out2,           file.path(path.data, "01_Dada2-Zhu/out2.rds"))
saveRDS(errF,           file.path(path.data, "01_Dada2-Zhu/errF.rds"))
saveRDS(errR,           file.path(path.data, "01_Dada2-Zhu/errR.rds"))
saveRDS(dadaFs,         file.path(path.data, "01_Dada2-Zhu/infered_seq_F.rds"))
saveRDS(dadaRs,         file.path(path.data, "01_Dada2-Zhu/infered_seq_R.rds"))
saveRDS(mergers,        file.path(path.data, "01_Dada2-Zhu/mergers.rds"))


# Taxa & Phyloseq object
saveRDS(taxa,       file.path(path.data, "01_Dada2-Zhu/taxa_zhu_longASV.rds"))
saveRDS(taxa.small, file.path(path.data, "01_Dada2-Zhu/taxa_zhu_shortASV.rds"))

saveRDS(physeq,       file.path(path.data, "01_Dada2-Zhu/physeq_zhu_longASV.rds"))
saveRDS(physeq.small, file.path(path.root, "data/analysis-individual/CLUSTER/PhyloTree/input/physeq_zhu.rds")) # phyloseq object with shorter ASVs
saveRDS(physeq.small, file.path(path.root, "data/phyloseq-objects/phyloseq-without-phylotree/physeq_zhu.rds"))

8. SESSION INFO


sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggalluvial_0.12.5           stringr_1.5.0              
##  [3] qckitfastq_1.10.0           dplyr_1.1.1                
##  [5] plyr_1.8.8                  data.table_1.14.8          
##  [7] ggplot2_3.4.2               phyloseq_1.38.0            
##  [9] seqTools_1.28.0             zlibbioc_1.40.0            
## [11] ShortRead_1.52.0            GenomicAlignments_1.30.0   
## [13] SummarizedExperiment_1.24.0 Biobase_2.54.0             
## [15] MatrixGenerics_1.6.0        matrixStats_0.63.0         
## [17] Rsamtools_2.10.0            GenomicRanges_1.46.1       
## [19] BiocParallel_1.28.3         Biostrings_2.62.0          
## [21] GenomeInfoDb_1.30.1         XVector_0.34.0             
## [23] IRanges_2.28.0              S4Vectors_0.32.4           
## [25] BiocGenerics_0.40.0         dada2_1.22.0               
## [27] Rcpp_1.0.10                
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-162           bitops_1.0-7           RColorBrewer_1.1-3    
##  [4] tools_4.1.3            bslib_0.4.2            utf8_1.2.3            
##  [7] R6_2.5.1               vegan_2.6-4            mgcv_1.8-42           
## [10] DBI_1.1.3              colorspace_2.1-0       permute_0.9-7         
## [13] rhdf5filters_1.6.0     ade4_1.7-22            withr_2.5.0           
## [16] tidyselect_1.2.0       compiler_4.1.3         cli_3.6.1             
## [19] DelayedArray_0.20.0    labeling_0.4.2         sass_0.4.5            
## [22] scales_1.2.1           RSeqAn_1.14.0          digest_0.6.31         
## [25] rmarkdown_2.21         jpeg_0.1-10            pkgconfig_2.0.3       
## [28] htmltools_0.5.5        highr_0.10             fastmap_1.1.1         
## [31] rlang_1.1.0            rstudioapi_0.14        farver_2.1.1          
## [34] jquerylib_0.1.4        generics_0.1.3         hwriter_1.3.2.1       
## [37] jsonlite_1.8.4         RCurl_1.98-1.12        magrittr_2.0.3        
## [40] GenomeInfoDbData_1.2.7 biomformat_1.22.0      interp_1.1-4          
## [43] Matrix_1.5-1           munsell_0.5.0          Rhdf5lib_1.16.0       
## [46] fansi_1.0.4            ape_5.7-1              lifecycle_1.0.3       
## [49] stringi_1.7.12         yaml_2.3.7             MASS_7.3-58.3         
## [52] rhdf5_2.38.1           grid_4.1.3             parallel_4.1.3        
## [55] crayon_1.5.2           deldir_1.0-6           lattice_0.20-45       
## [58] splines_4.1.3          multtest_2.50.0        knitr_1.42            
## [61] pillar_1.9.0           igraph_1.4.2           reshape2_1.4.4        
## [64] codetools_0.2-19       glue_1.6.2             evaluate_0.20         
## [67] latticeExtra_0.6-30    RcppParallel_5.1.7     png_0.1-8             
## [70] vctrs_0.6.1            foreach_1.5.2          purrr_1.0.1           
## [73] tidyr_1.3.0            gtable_0.3.3           cachem_1.0.7          
## [76] xfun_0.38              survival_3.5-5         tibble_3.2.1          
## [79] iterators_1.0.14       cluster_2.1.4